!pip install corpus

!pip install plotly --upgrade

!pip install spacy

In [1]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/content/drive/MyDrive/AI/评论情感词提取/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
In [2]:
# Base directory of the dataset (local Windows path).
# NOTE(review): the original started with 'F:\\\\' (an escaped DOUBLE
# backslash right after the drive letter); normalized to one separator
# per component. Windows tolerates doubled separators, but the intent
# was clearly a single one.
PATH = 'F:\\大三上\\文本挖掘技术\\data\\大作业\\'
In [3]:
def random_colours(number_of_colors):
    '''
    Generate random hex colour strings.
    Input:
        number_of_colors - integer value indicating the number of colours which are going to be generated.
    Output:
        Colours in the following format: ['#E86DA4'] .
    '''
    hex_digits = '0123456789ABCDEF'
    return ["#" + ''.join(random.choice(hex_digits) for _ in range(6))
            for _ in range(number_of_colors)]
In [4]:
# Load the train/test splits from the dataset directory.
train = pd.read_csv(PATH+'train.csv')
test = pd.read_csv(PATH+'test.csv')
#ss = pd.read_csv(PATH+'sample_submission.csv')
In [5]:
# Quick sanity check of dataset sizes.
print(train.shape)
print(test.shape)
(21984, 5)
(5497, 3)
In [6]:
# Dtypes and null counts — note the single null text/selected_text row.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21984 entries, 0 to 21983
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         21984 non-null  object
 1   text           21983 non-null  object
 2   selected_text  21983 non-null  object
 3   sentiment      21984 non-null  object
 4   id             21984 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 858.9+ KB
In [7]:
# Drop the single row whose text/selected_text is NaN (seen in .info() above).
# Reassignment instead of inplace=True: same result, but idempotent and
# chain-friendly, avoiding hidden-state surprises on re-run.
train = train.dropna()
In [8]:
# The test split has no selected_text column and no nulls.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         5497 non-null   int64 
 1   text       5497 non-null   object
 2   sentiment  5497 non-null   object
dtypes: int64(1), object(2)
memory usage: 129.0+ KB

EDA

In [9]:
# Peek at the first training rows.
train.head()
Out[9]:
textID text selected_text sentiment id
0 fa020570d4 Milk, Gran Torino, Bolt, Bride Wars, New In T... love positive 0
1 649e31adcc Starbucks I`m lovin` it Starbucks I`m lovin` it positive 1
2 8891d08a8c Ben and Jerry...yummmmy!!! .yummmmy! positive 2
3 94a3f5315a Hello, I see your online, can u talk to me pl... Hello, I see your online, can u talk to me ple... neutral 3
4 cb280b3adb _ kk`s have fun eating lol, remember shaun lo... fun positive 4
In [10]:
# Numeric summary (only the `id` column is numeric).
train.describe()
Out[10]:
id
count 21983.000000
mean 10991.321021
std 6346.467363
min 0.000000
25% 5495.500000
50% 10991.000000
75% 16487.500000
max 21983.000000
In [11]:
# Tweet counts per sentiment class, largest first.
temp = (train.groupby('sentiment')['text'].count()
        .reset_index()
        .sort_values(by='text', ascending=False))
temp.style.background_gradient(cmap='Purples')
Out[11]:
sentiment text
1 neutral 8922
2 positive 6843
0 negative 6218
In [12]:
# Bar chart of the class balance.
plt.figure(figsize=(12, 6))
sns.countplot(data=train, x='sentiment')
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x14f5dde2bb0>
In [13]:
# Funnel-area view of the same sentiment distribution.
funnel = go.Funnelarea(
    text=temp.sentiment,
    values=temp.text,
    title={"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"},
)
fig = go.Figure(funnel)
fig.show()
In [14]:
def jaccard(str1, str2): 
    '''Jaccard similarity between the word sets of two strings.

    Tokenisation is a simple lowercase whitespace split.
    Returns 0.0 when both strings tokenise to empty sets — the original
    raised ZeroDivisionError on that edge case (e.g. two empty strings).
    '''
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    denominator = len(a) + len(b) - len(c)  # |a ∪ b|
    return float(len(c)) / denominator if denominator else 0.0
In [15]:
# Per-row Jaccard similarity between the full tweet and the selected span.
results_jaccard = []

for _, row in train.iterrows():
    full_text = row.text
    selected = row.selected_text

    score = jaccard(full_text, selected)
    results_jaccard.append([full_text, selected, score])
In [16]:
# Merge the per-row scores back onto the training frame.
# NOTE: the original assigned this DataFrame to the name `jaccard`, silently
# shadowing the jaccard() function defined above (re-running earlier cells
# would then fail); renamed to jaccard_df.
jaccard_df = pd.DataFrame(results_jaccard, columns=["text", "selected_text", "jaccard_score"])
train = train.merge(jaccard_df, how='outer')
In [17]:
# Word-count features for the selected span and the full tweet.
def _word_count(s):
    return len(str(s).split())

train['Num_words_ST'] = train['selected_text'].apply(_word_count)   # words in selected text
train['Num_word_text'] = train['text'].apply(_word_count)           # words in main text
train['difference_in_words'] = train['Num_word_text'] - train['Num_words_ST']  # length gap
In [18]:
# Inspect the new feature columns.
train.head()
Out[18]:
textID text selected_text sentiment id jaccard_score Num_words_ST Num_word_text difference_in_words
0 fa020570d4 Milk, Gran Torino, Bolt, Bride Wars, New In T... love positive 0 0.055556 1 19 18
1 649e31adcc Starbucks I`m lovin` it Starbucks I`m lovin` it positive 1 1.000000 4 4 0
2 8891d08a8c Ben and Jerry...yummmmy!!! .yummmmy! positive 2 0.000000 1 3 2
3 94a3f5315a Hello, I see your online, can u talk to me pl... Hello, I see your online, can u talk to me ple... neutral 3 1.000000 16 16 0
4 cb280b3adb _ kk`s have fun eating lol, remember shaun lo... fun positive 4 0.090909 1 11 10
In [19]:
# Overlaid histograms of word counts: selected text vs full text.
hist_data = [train['Num_words_ST'], train['Num_word_text']]
group_labels = ['Selected_Text', 'Text']

fig = ff.create_distplot(hist_data, group_labels, show_curve=False)
# Single layout update instead of two consecutive ones — same result.
fig.update_layout(
    title_text='Distribution of Number Of words',
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue",
)
fig.show()
In [20]:
# KDE comparison of word counts (selected text in red, full text in blue).
plt.figure(figsize=(12, 6))
ax = sns.kdeplot(train['Num_words_ST'], shade=True, color="r")
ax.set_title('Kernel Distribution of Number Of words')
ax = sns.kdeplot(train['Num_word_text'], shade=True, color="b")
In [21]:
# KDE of the word-count gap, positive vs negative tweets.
plt.figure(figsize=(12, 6))
pos_diff = train[train['sentiment'] == 'positive']['difference_in_words']
neg_diff = train[train['sentiment'] == 'negative']['difference_in_words']
ax = sns.kdeplot(pos_diff, shade=True, color="b")
ax.set_title('Kernel Distribution of Difference in Number Of words')
ax2 = sns.kdeplot(neg_diff, shade=True, color="r")
In [22]:
# Neutral tweets: histogram of the word-count gap (no KDE curve).
plt.figure(figsize=(12, 6))
sns.distplot(train.loc[train['sentiment'] == 'neutral', 'difference_in_words'], kde=False)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x14f5f9dc820>
In [23]:
# Jaccard-score KDEs for positive vs negative tweets.
plt.figure(figsize=(12, 6))
pos_scores = train.loc[train['sentiment'] == 'positive', 'jaccard_score']
neg_scores = train.loc[train['sentiment'] == 'negative', 'jaccard_score']
ax = sns.kdeplot(pos_scores, shade=True, color="b")
ax.set_title('KDE of Jaccard Scores across different Sentiments')
ax2 = sns.kdeplot(neg_scores, shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
Out[23]:
<matplotlib.legend.Legend at 0x14f5f4c04c0>
In [24]:
# Neutral tweets: jaccard-score histogram (scores cluster near 1 —
# selected_text is usually the whole tweet).
plt.figure(figsize=(12, 6))
sns.distplot(train.loc[train['sentiment'] == 'neutral', 'jaccard_score'], kde=False)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x14f5dde20d0>
In [25]:
# Subset of very short tweets (two words or fewer).
k = train.loc[train['Num_word_text'] <= 2]
In [26]:
# Mean Jaccard score per sentiment for very short tweets.
# Select the column BEFORE aggregating: the original computed the mean of
# every column and then kept one, which is wasteful and deprecated for
# non-numeric columns in newer pandas.
k.groupby('sentiment')['jaccard_score'].mean()
Out[26]:
sentiment
negative    0.805556
neutral     0.980450
positive    0.757292
Name: jaccard_score, dtype: float64
In [27]:
# Short tweets labelled positive — even these often copy the whole tweet.
k.loc[k['sentiment'] == 'positive']
Out[27]:
textID text selected_text sentiment id jaccard_score Num_words_ST Num_word_text difference_in_words
170 24bef2a55f awesome RONIN121 awesome positive 170 0.5 1 2 1
238 dbc8ab6a8c g`mornin g`mornin positive 238 1.0 1 1 0
267 515f4c5c4a Goodnight!!!!!! Goodnight! positive 267 0.0 1 1 0
399 2b562bcae4 Clever girl Clever positive 399 0.5 1 2 1
719 02e959833e Aw Yay Yay positive 719 0.5 1 2 1
... ... ... ... ... ... ... ... ... ...
20852 b72d063ad6 Good morning Good morning positive 20853 1.0 2 2 0
20889 0ccac3f8b7 sweet situations. sweet situations. positive 20890 1.0 2 2 0
21582 bc5a131667 HAPPY JUDDDAY HAPPY JUDDDAY positive 21583 1.0 2 2 0
21656 b4fa2d20e4 Good one Good one positive 21657 1.0 2 2 0
21950 a13175162b just relax just relax positive 21951 1.0 2 2 0

160 rows × 9 columns

清理语料库
在开始从文本和选定文本的单词中提取信息之前,让我们首先清理数据

In [28]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove HTML tags and punctuation, and remove words containing numbers.

    Raw strings are used for every regex: the original plain strings
    ('\\[', '\\S', '\\w', ...) are invalid escape sequences that emit
    DeprecationWarning/SyntaxWarning on modern Python.
    '''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                  # [bracketed] spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)    # URLs
    text = re.sub(r'<.*?>+', '', text)                   # HTML-like tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub(r'\n', '', text)                       # newlines
    text = re.sub(r'\w*\d\w*', '', text)                 # words containing digits
    return text
In [29]:
# Run the cleaner over both text columns.
for col in ('text', 'selected_text'):
    train[col] = train[col].apply(clean_text)
In [30]:
# Confirm the cleaning (lowercase, no punctuation/URLs).
train.head()
Out[30]:
textID text selected_text sentiment id jaccard_score Num_words_ST Num_word_text difference_in_words
0 fa020570d4 milk gran torino bolt bride wars new in town ... love positive 0 0.055556 1 19 18
1 649e31adcc starbucks im lovin it starbucks im lovin it positive 1 1.000000 4 4 0
2 8891d08a8c ben and jerryyummmmy yummmmy positive 2 0.000000 1 3 2
3 94a3f5315a hello i see your online can u talk to me plee... hello i see your online can u talk to me pleee... neutral 3 1.000000 16 16 0
4 cb280b3adb kks have fun eating lol remember shaun loves... fun positive 4 0.090909 1 11 10

Most Common words in our Target-Selected Text

In [31]:
# Tokenise the selected text and tabulate its 20 most frequent words.
train['temp_list'] = train['selected_text'].apply(lambda x: str(x).split())
top = Counter(word for tokens in train['temp_list'] for word in tokens)
temp = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count'])
temp.style.background_gradient(cmap='Blues')
Out[31]:
Common_words count
0 i 5741
1 to 4243
2 the 3698
3 a 2827
4 my 2230
5 you 2094
6 and 1842
7 is 1734
8 it 1733
9 in 1586
10 for 1485
11 of 1317
12 im 1316
13 me 1241
14 on 1191
15 so 1135
16 have 1073
17 that 1030
18 but 1020
19 good 1003
In [32]:
# Horizontal bar chart of the most frequent selected-text words.
fig = px.bar(
    temp, x="count", y="Common_words", color='Common_words',
    title='Commmon Words in Selected Text', orientation='h',
    width=700, height=700,
)
fig.show()

Oops! While we cleaned our dataset we didn't remove the stop words, and hence we can see that the most common word is 'to'. Let's try again after removing the stopwords.

In [33]:
# Fetch the NLTK stopword list (requires network access; the captured output
# below shows it failed offline, which is why the next cells raise
# LookupError until the corpus is available).
nltk.download('stopwords')
[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     远程主机强迫关闭了一个现有的连接。>
Out[33]:
False
In [34]:
def remove_stopword(x):
    '''Return the tokens of `x` with English stopwords removed.

    The stopword list is materialised once as a set: the original called
    stopwords.words('english') (building a fresh list) for EVERY token,
    making each membership test O(list length) and re-reading the corpus
    repeatedly. Requires the NLTK 'stopwords' corpus to be downloaded.
    '''
    stop_words = set(stopwords.words('english'))
    return [y for y in x if y not in stop_words]
train['temp_list'] = train['temp_list'].apply(lambda x: remove_stopword(x))
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     82                 try:
---> 83                     root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
     84                 except LookupError:

d:\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
    584     resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
--> 585     raise LookupError(resource_not_found)
    586 

LookupError: 
**********************************************************************
  Resource stopwords not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('stopwords')
  
  For more information see: https://www.nltk.org/data.html

  Attempted to load corpora/stopwords.zip/stopwords/

  Searched in:
    - 'C:\\Users\\YCDN/nltk_data'
    - 'd:\\Anaconda3\\nltk_data'
    - 'd:\\Anaconda3\\share\\nltk_data'
    - 'd:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\YCDN\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


During handling of the above exception, another exception occurred:

LookupError                               Traceback (most recent call last)
<ipython-input-34-a148615a4b3b> in <module>
      1 def remove_stopword(x):
      2     return [y for y in x if y not in stopwords.words('english')]
----> 3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

d:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   3846             else:
   3847                 values = self.astype(object).values
-> 3848                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   3849 
   3850         if len(mapped) and isinstance(mapped[0], Series):

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-34-a148615a4b3b> in <lambda>(x)
      1 def remove_stopword(x):
      2     return [y for y in x if y not in stopwords.words('english')]
----> 3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

<ipython-input-34-a148615a4b3b> in remove_stopword(x)
      1 def remove_stopword(x):
----> 2     return [y for y in x if y not in stopwords.words('english')]
      3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

<ipython-input-34-a148615a4b3b> in <listcomp>(.0)
      1 def remove_stopword(x):
----> 2     return [y for y in x if y not in stopwords.words('english')]
      3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __getattr__(self, attr)
    118             raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
    119 
--> 120         self.__load()
    121         # This looks circular, but its not, since __load() changes our
    122         # __class__ to something new:

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     83                     root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
     84                 except LookupError:
---> 85                     raise e
     86 
     87         # Load the corpus.

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     78         else:
     79             try:
---> 80                 root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
     81             except LookupError as e:
     82                 try:

d:\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
    583     sep = "*" * 70
    584     resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
--> 585     raise LookupError(resource_not_found)
    586 
    587 

LookupError: 
**********************************************************************
  Resource stopwords not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('stopwords')
  
  For more information see: https://www.nltk.org/data.html

  Attempted to load corpora/stopwords

  Searched in:
    - 'C:\\Users\\YCDN/nltk_data'
    - 'd:\\Anaconda3\\nltk_data'
    - 'd:\\Anaconda3\\share\\nltk_data'
    - 'd:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\YCDN\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
In [35]:
# Re-tabulate the top selected-text words after stopword removal,
# dropping the first row as in the original.
top = Counter(word for tokens in train['temp_list'] for word in tokens)
temp = pd.DataFrame(top.most_common(20)).iloc[1:, :]
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Purples')
Out[35]:
Common_words count
1 to 4243
2 the 3698
3 a 2827
4 my 2230
5 you 2094
6 and 1842
7 is 1734
8 it 1733
9 in 1586
10 for 1485
11 of 1317
12 im 1316
13 me 1241
14 on 1191
15 so 1135
16 have 1073
17 that 1030
18 but 1020
19 good 1003
In [36]:
# path=['Common_words']

# fig = px.treemap(temp, path, values=['count'],title='Tree of Most Common Words', color='Common_words')
# fig.show()

Most Common words in Text

In [37]:
# Tokenise the full tweet text, then strip stopwords.
train['temp_list1'] = train['text'].apply(lambda t: str(t).split())  # token list per row
train['temp_list1'] = train['temp_list1'].apply(remove_stopword)     # drop stopwords
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     82                 try:
---> 83                     root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
     84                 except LookupError:

d:\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
    584     resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
--> 585     raise LookupError(resource_not_found)
    586 

LookupError: 
**********************************************************************
  Resource stopwords not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('stopwords')
  
  For more information see: https://www.nltk.org/data.html

  Attempted to load corpora/stopwords.zip/stopwords/

  Searched in:
    - 'C:\\Users\\YCDN/nltk_data'
    - 'd:\\Anaconda3\\nltk_data'
    - 'd:\\Anaconda3\\share\\nltk_data'
    - 'd:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\YCDN\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


During handling of the above exception, another exception occurred:

LookupError                               Traceback (most recent call last)
<ipython-input-37-9c6800497ea9> in <module>
      1 train['temp_list1'] = train['text'].apply(lambda x:str(x).split()) #List of words in every row for text
----> 2 train['temp_list1'] = train['temp_list1'].apply(lambda x:remove_stopword(x)) #Removing Stopwords

d:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   3846             else:
   3847                 values = self.astype(object).values
-> 3848                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   3849 
   3850         if len(mapped) and isinstance(mapped[0], Series):

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-37-9c6800497ea9> in <lambda>(x)
      1 train['temp_list1'] = train['text'].apply(lambda x:str(x).split()) #List of words in every row for text
----> 2 train['temp_list1'] = train['temp_list1'].apply(lambda x:remove_stopword(x)) #Removing Stopwords

<ipython-input-34-a148615a4b3b> in remove_stopword(x)
      1 def remove_stopword(x):
----> 2     return [y for y in x if y not in stopwords.words('english')]
      3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

<ipython-input-34-a148615a4b3b> in <listcomp>(.0)
      1 def remove_stopword(x):
----> 2     return [y for y in x if y not in stopwords.words('english')]
      3 train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __getattr__(self, attr)
    118             raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
    119 
--> 120         self.__load()
    121         # This looks circular, but its not, since __load() changes our
    122         # __class__ to something new:

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     83                     root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
     84                 except LookupError:
---> 85                     raise e
     86 
     87         # Load the corpus.

d:\Anaconda3\lib\site-packages\nltk\corpus\util.py in __load(self)
     78         else:
     79             try:
---> 80                 root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
     81             except LookupError as e:
     82                 try:

d:\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
    583     sep = "*" * 70
    584     resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
--> 585     raise LookupError(resource_not_found)
    586 
    587 

LookupError: 
**********************************************************************
  Resource stopwords not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('stopwords')
  
  For more information see: https://www.nltk.org/data.html

  Attempted to load corpora/stopwords

  Searched in:
    - 'C:\\Users\\YCDN/nltk_data'
    - 'd:\\Anaconda3\\nltk_data'
    - 'd:\\Anaconda3\\share\\nltk_data'
    - 'd:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\YCDN\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
In [38]:
# Top 25 words across the full tweet text (first row dropped as before).
top = Counter(word for tokens in train['temp_list1'] for word in tokens)
temp = pd.DataFrame(top.most_common(25)).iloc[1:, :]
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')
Out[38]:
Common_words count
1 to 8027
2 the 7218
3 a 5360
4 my 4428
5 and 4016
6 you 3859
7 it 3255
8 is 3226
9 in 3029
10 for 2925
11 of 2527
12 im 2398
13 on 2276
14 me 2256
15 so 2070
16 have 2065
17 that 2060
18 but 1832
19 just 1782
20 with 1699
21 day 1617
22 be 1604
23 its 1589
24 at 1527
In [39]:
# Horizontal bar chart of the most frequent full-text words.
fig = px.bar(
    temp, x="count", y="Common_words", color='Common_words',
    title='Commmon Words in Text', orientation='h',
    width=700, height=700,
)
fig.show()

Most Common Words by Sentiment

In [40]:
# Split the corpus by sentiment label for per-class analysis.
Positive_sent = train.loc[train['sentiment'] == 'positive']
Negative_sent = train.loc[train['sentiment'] == 'negative']
Neutral_sent = train.loc[train['sentiment'] == 'neutral']
In [41]:
#MosT common positive words
top = Counter([item for sublist in Positive_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')
Out[41]:
Common_words count
0 i 811
1 good 672
2 happy 588
3 love 540
4 you 488
5 to 479
6 the 478
7 a 472
8 day 364
9 thanks 354
10 great 292
11 it 278
12 my 232
13 for 225
14 fun 224
15 is 224
16 nice 220
17 and 213
18 so 208
19 mothers 206
In [42]:
# Bar chart of the most common positive words.
fig = px.bar(
    temp_positive, x="count", y="Common_words", color='Common_words',
    title='Most Commmon Positive Words', orientation='h',
    width=700, height=700,
)
fig.show()
In [43]:
#MosT common negative words
top = Counter([item for sublist in Negative_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(20))
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')
Out[43]:
Common_words count
1 to 461
2 the 439
3 my 417
4 a 375
5 im 364
6 not 334
7 is 306
8 so 297
9 miss 293
10 sad 275
11 it 265
12 me 245
13 sorry 237
14 in 208
15 and 198
16 you 195
17 that 188
18 bad 187
19 hate 180
In [44]:
# print(path)
In [45]:
# path=['Common_words']

# fig = px.treemap(temp_negative, path, values='count',title='Tree Of Most Common Negative Words')
# fig.show()
In [46]:
#MosT common Neutral words
top = Counter([item for sublist in Neutral_sent['temp_list'] for item in sublist])
temp_neutral = pd.DataFrame(top.most_common(20))
temp_neutral = temp_neutral.loc[1:,:]
temp_neutral.columns = ['Common_words','count']
temp_neutral.style.background_gradient(cmap='Reds')
Out[46]:
Common_words count
1 to 3303
2 the 2781
3 a 1980
4 my 1581
5 and 1431
6 you 1411
7 in 1261
8 is 1204
9 it 1190
10 for 1125
11 on 997
12 of 981
13 me 882
14 but 880
15 im 810
16 have 798
17 that 720
18 just 704
19 with 665
In [47]:
# Bar chart of the most common neutral words.
fig = px.bar(
    temp_neutral, x="count", y="Common_words", color='Common_words',
    title='Most Commmon Neutral Words', orientation='h',
    width=700, height=700,
)
fig.show()
In [48]:
# path=['Common_words']

# fig = px.treemap(temp_neutral, path, values='count',title='Tree Of Most Common Neutral Words')
# fig.show()

Let's Look at Unique Words in each Segment

In [49]:
# Flatten every stopword-free tweet into one list of tokens.
raw_text = []
for word_list in train['temp_list1']:
    raw_text.extend(word_list)
In [50]:
def words_unique(sentiment, numwords, raw_words):
    '''
    Count words that occur ONLY in tweets of the given sentiment.

    Input:
        sentiment - sentiment category (ex. 'neutral');
        numwords  - how many of the top unique words to return;
        raw_words - flat list of all tokens in the corpus.
    Output:
        DataFrame of the `numwords` most frequent words that appear in the
        chosen sentiment's tweets and in no other sentiment's tweets,
        in descending order of count.

    Reads the module-level `train` DataFrame (temp_list1 column).
    '''
    # Vocabulary of every OTHER sentiment, kept as a set for O(1) membership
    # tests (the original round-tripped through list(set(...)) and then
    # scanned that list once per token — O(n*m)).
    allother = set()
    for item in train[train.sentiment != sentiment]['temp_list1']:
        allother.update(item)

    # Words never seen in other sentiments.
    # Fix: use the raw_words PARAMETER — the original ignored it and read the
    # global raw_text (identical results for existing callers, which pass
    # raw_text in).
    specificnonly = {x for x in raw_words if x not in allother}

    mycounter = Counter()
    for item in train[train.sentiment == sentiment]['temp_list1']:
        for word in item:
            mycounter[word] += 1

    # Keep only the sentiment-exclusive words.
    for word in list(mycounter):
        if word not in specificnonly:
            del mycounter[word]

    return pd.DataFrame(mycounter.most_common(numwords), columns=['words', 'count'])

Positive Tweets

In [51]:
# Top 20 words exclusive to positive tweets.
Unique_Positive = words_unique('positive', 20, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Positive.style.background_gradient(cmap='Greens')
The top 20 unique words in Positive Tweets are:
Out[51]:
words count
0 congratulations 21
1 goodmorning 8
2 grateful 7
3 brilliant 7
4 presents 7
5 thnx 7
6 shared 7
7 lovin 6
8 greetings 6
9 honored 6
10 juddday 6
11 coolest 6
12 mothersday 5
13 inspiration 5
14 appreciated 5
15 mcr 5
16 brave 4
17 hurray 4
18 mint 4
19 curry 4
In [52]:
# path=['words']

# fig = px.treemap(Unique_Positive, path, values='count',title='Tree Of Unique Positive Words')
# fig.show()
In [59]:
# Donut chart of the unique positive words.
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16, 10))
hole = plt.Circle((0, 0), 0.7, color='white')  # white centre makes the donut
plt.pie(Unique_Positive['count'], labels=Unique_Positive.words, colors=Pastel1_7.hex_colors)
plt.gcf().gca().add_artist(hole)
plt.title('DoNut Plot Of Unique Positive Words')
plt.show()
In [ ]:
# Top 10 words exclusive to negative tweets.
Unique_Negative = words_unique('negative', 10, raw_text)
print("The top 10 unique words in Negative Tweets are:")
Unique_Negative.style.background_gradient(cmap='Reds')
In [ ]:
# Donut chart of the unique negative words.
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16, 10))
hole = plt.Circle((0, 0), 0.7, color='white')
plt.rcParams['text.color'] = 'black'  # keep labels readable on white
plt.pie(Unique_Negative['count'], labels=Unique_Negative.words, colors=Pastel1_7.hex_colors)
plt.gcf().gca().add_artist(hole)
plt.title('DoNut Plot Of Unique Negative Words')
plt.show()
In [ ]:
# Top 10 words exclusive to neutral tweets.
Unique_Neutral = words_unique('neutral', 10, raw_text)
print("The top 10 unique words in Neutral Tweets are:")
Unique_Neutral.style.background_gradient(cmap='Oranges')
In [ ]:
# Donut chart of the unique neutral words.
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16, 10))
hole = plt.Circle((0, 0), 0.7, color='white')
plt.pie(Unique_Neutral['count'], labels=Unique_Neutral.words, colors=Pastel1_7.hex_colors)
plt.gcf().gca().add_artist(hole)
plt.title('DoNut Plot Of Unique Neutral Words')
plt.show()

It's Time For WordClouds

In [ ]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0), color = 'white',
                   title = None, title_size=40, image_color=False):
    '''Render a word cloud for `text` and display it with matplotlib.

    Parameters:
        text        - object whose str() is fed to WordCloud.generate().
        mask        - optional image array shaping the cloud (None = rectangle).
        max_words   - cap on the number of words drawn.
        color       - background colour of the cloud.
        image_color - if True, recolour words from the mask's palette
                      (assumes mask is a colour image — TODO confirm).
    '''
    # WordCloud's built-in stopword list plus tweet-specific noise tokens.
    stopwords = set(STOPWORDS)
    more_stopwords = {'u', "im"}
    stopwords = stopwords.union(more_stopwords)

    # random_state pinned so the layout is reproducible across runs.
    wordcloud = WordCloud(background_color=color,
                    stopwords = stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size, 
                    random_state = 42,
                    width=400, 
                    height=200,
                    mask = mask)
    wordcloud.generate(str(text))
    
    plt.figure(figsize=figure_size)
    if image_color:
        # Take word colours from the mask image itself.
        image_colors = ImageColorGenerator(mask);
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear");
        plt.title(title, fontdict={'size': title_size,  
                                  'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud);
        plt.title(title, fontdict={'size': title_size, 'color': 'black', 
                                  'verticalalignment': 'bottom'})
    plt.axis('off');
    plt.tight_layout()  
# Base directory for word-cloud mask images (Colab Drive path).
d = '/content/drive/MyDrive/masks-wordclouds/'
In [ ]:
# Load the comment-bubble mask and draw the neutral-tweet cloud.
pos_mask = np.array(Image.open(d+ 'comment.png'))
plot_wordcloud(Neutral_sent.text,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Neutral Tweets")
In [ ]:
# Positive tweets, reusing the same mask.
plot_wordcloud(Positive_sent.text,mask=pos_mask,title="Word Cloud Of Positive tweets",title_size=30)
In [ ]:
# Negative tweets, same mask.
plot_wordcloud(Negative_sent.text,mask=pos_mask,title="Word Cloud of Negative Tweets",color='white',title_size=30)

Modelling

1)Modelling the Problem as NER

In [ ]:
# Reload the raw (uncleaned) data for NER modelling — the NER needs the
# original character offsets, which the cleaning above destroyed.
df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_submission = pd.read_csv(PATH+'sample_submission.csv')
In [ ]:
# Word count of each training tweet (used to filter very short tweets below).
df_train['Num_words_text'] = df_train['text'].apply(lambda x:len(str(x).split())) #Number Of words in main Text in train set
In [ ]:
# Keep only tweets with at least 3 words — shorter ones are returned whole
# at prediction time (see the `len(text.split()) <= 2` branch below), so
# they add nothing to NER training.
df_train = df_train[df_train['Num_words_text']>=3]
In [ ]:
def save_model(output_dir, nlp, new_model_name):
    '''Save a trained spaCy pipeline to disk.

    Parameters:
        output_dir     - path fragment relative to the project Drive folder;
                         may be None, in which case nothing is saved.
        nlp            - the spaCy Language object to persist.
        new_model_name - value stored in the model's meta "name" field.
    '''
    # Check for None BEFORE building the path: the original formatted the
    # f-string first, so a None output_dir became the literal ".../None"
    # directory and the guard below could never fire.
    if output_dir is not None:
        output_dir = f'/content/drive/MyDrive/AI/评论情感词提取/{output_dir}'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
In [ ]:
# Pass model=nlp to continue training an existing pipeline instead of a blank one.

def train(train_data, output_dir, n_iter=50, model=None):
    """Load the model, set up the pipeline and train the entity recognizer.

    train_data : list of (text, {"entities": [[start, end, label]]}) pairs.
    output_dir : path fragment passed to save_model() / spacy.load().
    n_iter     : number of passes over the shuffled training data.
    model      : if not None, resume from the model saved at output_dir.

    NOTE(review): this function shadows the `train` DataFrame used in the EDA
    section above — earlier cells cannot be re-run after this cell executes.
    NOTE(review): this is the spaCy v2 training API (create_pipe / nlp.update
    with (texts, annotations)); it will not run unchanged on spaCy v3+.
    """
    ""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels (here only 'selected_text', taken from each annotation)
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()


        # One shuffled pass per iteration; batch size grows from 4 toward 1000.
        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 1000.0, 1.001))    
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.9,   # dropout - make it harder to memorise data
                            losses=losses, 
                            )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')
In [ ]:
def get_model_out_path(sentiment):
    '''
    Returns Model output path
    '''
    # Map each trained sentiment to its model directory; anything else
    # (e.g. 'neutral') has no model and yields None.
    paths = {
        'positive': 'models/model_pos',
        'negative': 'models/model_neg',
    }
    return paths.get(sentiment)
In [ ]:
def get_training_data(sentiment):
    '''
    Returns training data in the format needed to train the spaCy NER:
    a list of (text, {"entities": [[start, end, 'selected_text']]}) pairs
    for the rows of the module-level df_train matching `sentiment`.
    '''
    train_data = []
    for index, row in df_train.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            start = text.find(selected_text)
            if start == -1:
                # selected_text not found verbatim in text: str.find returns
                # -1, which would produce a corrupt (negative-start) entity
                # span — skip such rows instead.
                continue
            end = start + len(selected_text)
            train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

Training models for Positive and Negative tweets

In [ ]:
sentiment = 'positive'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
# Train the positive-tweet NER. n_iter=50 is slow on CPU — lower it for a quick demo run.
train(train_data, model_path, n_iter=50, model=None)
In [ ]:
sentiment = 'negative'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

# Train the negative-tweet NER with the same settings.
train(train_data, model_path, n_iter=50, model=None)

Predicting with the trained Model

In [ ]:
def predict_entities(text, model):
    '''Run the NER model on `text` and return the first predicted span,
    falling back to the whole text when nothing is detected.'''
    doc = model(text)
    spans = []
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        candidate = [start, end, ent.label_]
        if candidate not in spans:
            spans.append(candidate)
    if spans:
        first_start, first_end, _ = spans[0]
        return text[first_start:first_end]
    return text
In [ ]:
# Load both trained NER models and predict a selected span for every test row.
selected_texts = []
MODELS_BASE_PATH = '/content/drive/MyDrive/AI/评论情感词提取/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')
        
    for index, row in df_test.iterrows():
        text = row.text
        output_str = ""  # NOTE(review): never used below — leftover variable
        # Heuristic: neutral or very short (<=2 words) tweets are returned
        # whole; otherwise delegate to the sentiment-specific NER model.
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))
        
df_test['selected_text'] = selected_texts
In [ ]:
# Assemble and persist the submission file.
df_submission['selected_text'] = df_test['selected_text']
df_submission['id'] = df_test['id']
# NOTE(review): pandas documents to_csv's `header` as bool/list; header=None
# acts as "no header row" here, but header=False is the documented spelling —
# confirm the downstream scorer expects a headerless CSV.
df_submission.to_csv("/content/drive/MyDrive/AI/评论情感词提取/output/DA_n_iter50_drop0.9_bsz4_1000.csv", header=None, index=False, sep=',')
display(df_submission.head(10))
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: